{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "# LIBRARIES IMPORT ------------------------------------------------\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import datetime\n",
    "from sklearn import preprocessing\n",
    "from sklearn.metrics import mean_squared_error\n",
    "from sklearn.model_selection import train_test_split\n",
    "from keras.models import Model\n",
    "#from keras.layers import Input, Dense, LSTM, TimeDistributed, CuDNNLSTM\n",
    "from keras.optimizers import SGD\n",
    "from keras.callbacks import EarlyStopping, ModelCheckpoint\n",
    "from keras import backend as K\n",
    "\n",
    "air_reserve = pd.read_csv('air_reserve.csv')\n",
    "air_store_info = pd.read_csv('air_store_info.csv')\n",
    "air_visit_data = pd.read_csv('air_visit_data.csv')\n",
    "hpg_reserve = pd.read_csv('hpg_reserve.csv')\n",
    "hpg_store_info = pd.read_csv('hpg_store_info.csv')\n",
    "store_id_relation = pd.read_csv('store_id_relation.csv')\n",
    "date_info = pd.read_csv('date_info.csv')\n",
    "sample_sub = pd.read_csv('sample_submission.csv')\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "# DATA PREPARATION =====================================================\n",
    "# TEST DATA ------------------------------------------------------------\n",
    "air_test = sample_sub.copy()\n",
    "air_test['air_store_id'] = air_test['id'].apply(lambda x: str(x)[:-11])\n",
    "air_test['visit_date'] = air_test['id'].apply(lambda x: str(x)[-10:])\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "# dataframe for predictions\n",
    "submission_LSTM = air_test.copy()\n",
    "\n",
    "# test set for merger with train set\n",
    "air_test = air_test.drop(['id', 'visitors'], axis = 1)\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "# DATA ON RESERVATION --------------------------------------------------\n",
    "# combine air and hpg databases\n",
    "hpg_air_reserve = store_id_relation.join(hpg_reserve.set_index('hpg_store_id'), on = 'hpg_store_id')\n",
    "air_reserve_tmp = air_reserve.copy()\n",
    "hpg_air_reserve = hpg_air_reserve.drop('hpg_store_id', axis = 1)\n",
    "reserve = pd.concat([air_reserve_tmp, hpg_air_reserve])\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "# convert columns of \"reserve\" table into datetime format\n",
    "reserve['visit_datetime'] =  pd.to_datetime(reserve['visit_datetime'])\n",
    "reserve['reserve_datetime'] =  pd.to_datetime(reserve['reserve_datetime'])\n",
    "\n",
    "# create column for visit date inside \"reserve\" table\n",
    "reserve['visit_date'] = reserve['visit_datetime'].apply(lambda x: str(x)[0:10])\n",
    "\n",
    "# calculate the gap between visit time and reservation time inside \"reserve\" table\n",
    "reserve['hour_gap'] = reserve['visit_datetime'].sub(reserve['reserve_datetime'])\n",
    "reserve['hour_gap'] = reserve['hour_gap'].apply(lambda x: x/np.timedelta64(1,'h'))\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "# separate reservation into 5 categories based on gap lenght\n",
    "reserve['reserve_-12_h'] = np.where(reserve['hour_gap'] <= 12,\n",
    "                                    reserve['reserve_visitors'], 0)\n",
    "reserve['reserve_12_37_h'] = np.where((reserve['hour_gap'] <= 37) & (reserve['hour_gap'] > 12),\n",
    "                                       reserve['reserve_visitors'], 0)\n",
    "reserve['reserve_37_59_h'] = np.where((reserve['hour_gap'] <= 59) & (reserve['hour_gap'] > 37),\n",
    "                                       reserve['reserve_visitors'], 0)\n",
    "reserve['reserve_59_85_h'] = np.where((reserve['hour_gap'] <= 85) & (reserve['hour_gap'] > 59),\n",
    "                                       reserve['reserve_visitors'], 0)\n",
    "reserve['reserve_85+_h'] = np.where((reserve['hour_gap'] > 85),\n",
    "                                     reserve['reserve_visitors'], 0)\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "# group by air_store_id and visit_date to enable joining with main table\n",
    "group_list = ['air_store_id', 'visit_date', 'reserve_visitors', 'reserve_-12_h',\n",
    "              'reserve_12_37_h', 'reserve_37_59_h', 'reserve_59_85_h', 'reserve_85+_h']\n",
    "reserve = reserve[group_list].groupby(['air_store_id', 'visit_date'], as_index = False).sum()\n",
    "\n",
    "for i in group_list[2:]:\n",
    "    reserve[i] = reserve[i].apply(lambda x: np.log1p(x))\n",
    "\n",
    "# GENRE DATA ----------------------------------------------------------\n",
    "# total amount of restaurants of specific genres by area_name\n",
    "air_genres_area = air_store_info.copy()\n",
    "air_genres_area = air_genres_area[['air_store_id', 'air_genre_name', 'air_area_name']].groupby(['air_genre_name', 'air_area_name'],\n",
    "                                                                                              as_index = False).count()\n",
    "air_genres_area = air_genres_area.rename(columns = {'air_store_id': 'genre_in_area'})\n",
    "\n",
    "# total amount of restaurants in area\n",
    "air_area = air_store_info.copy()\n",
    "air_area = air_area[['air_store_id', 'air_area_name']].groupby(['air_area_name'], as_index = False).count()\n",
    "air_area = air_area.rename(columns = {'air_store_id': 'total_r_in_area'})\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/ankitgupta/anaconda/lib/python2.7/site-packages/pandas/core/indexing.py:179: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame\n",
      "\n",
      "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
      "  self._setitem_with_indexer(indexer, value)\n"
     ]
    }
   ],
   "source": [
    "# WEEKEND AND HOLIDAYS ------------------------------------------------\n",
    "# additional features for weekends and holidays\n",
    "date_info_mod = date_info.copy()\n",
    "date_info_mod['holiday_eve'] = np.zeros(date_info_mod.shape[0])\n",
    "date_info_mod['holiday_eve'].iloc[:-1] = date_info_mod['holiday_flg'].copy().values[1:]\n",
    "date_info_mod['non_working'] = np.where(date_info_mod['day_of_week'].isin(['Saturday', 'Sunday']) |\n",
    "                                        date_info_mod['holiday_flg'] == 1, 1, 0)\n",
    "date_info_mod = date_info_mod.drop('holiday_flg', axis = 1)\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "# average visitors per restaurant by working and non-working days\n",
    "air_visit_wd = air_visit_data.join(date_info_mod.set_index('calendar_date'), on = 'visit_date')\n",
    "air_visit_wd['visitors'] = air_visit_wd['visitors'].apply(lambda x: np.log1p(x)) \n",
    "\n",
    "# average visitors per restaurant\n",
    "mean_df = air_visit_wd[['visitors',\n",
    "                        'air_store_id',\n",
    "                        'non_working']].copy().groupby(['air_store_id',\n",
    "                                                        'non_working'],\n",
    "                                                        as_index = False).mean()\n",
    "mean_df = mean_df.rename(columns = {'visitors': 'visitors_mean'})\n",
    "\n",
    "\n",
    "# median visitors per restaurant\n",
    "median_df = air_visit_wd[['visitors',\n",
    "                          'air_store_id',\n",
    "                          'non_working']].copy().groupby(['air_store_id',\n",
    "                                                          'non_working'],\n",
    "                                                          as_index = False).median()\n",
    "median_df = median_df.rename(columns = {'visitors': 'visitors_median'})\n",
    "\n",
    "# max visitors per restaurant\n",
    "max_df = air_visit_wd[['visitors',\n",
    "                       'air_store_id',\n",
    "                       'non_working']].copy().groupby(['air_store_id',\n",
    "                                                       'non_working'],\n",
    "                                                       as_index = False).max()\n",
    "max_df = max_df.rename(columns = {'visitors': 'visitors_max'})\n",
    "\n",
    "# min visitors per restaurant\n",
    "min_df = air_visit_wd[['visitors',\n",
    "                       'air_store_id',\n",
    "                       'non_working']].copy().groupby(['air_store_id',\n",
    "                                                       'non_working'],\n",
    "                                                       as_index = False).min()\n",
    "min_df = min_df.rename(columns = {'visitors': 'visitors_min'})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def merge_join(df):\n",
    "    # add month of visit\n",
    "    df['month'] = df['visit_date'].apply(lambda x: float(str(x)[5:7]))\n",
    "\n",
    "    # add weekday and holiday flag\n",
    "    df = df.join(date_info_mod.set_index('calendar_date'), on = 'visit_date')\n",
    "    # add genre and area name)\n",
    "    df = df.join(air_store_info.set_index('air_store_id'), on = 'air_store_id')\n",
    "    # add quantity of same genre in area\n",
    "    df = pd.merge(df, air_genres_area, how = 'left',\n",
    "                  left_on = ['air_genre_name', 'air_area_name'],\n",
    "                  right_on = ['air_genre_name', 'air_area_name'])\n",
    "    # add total quatity of restaurants in area\n",
    "    df = pd.merge(df, air_area, how = 'left',\n",
    "                  left_on = ['air_area_name'],\n",
    "                  right_on = ['air_area_name'])\n",
    "    # add reservation information\n",
    "    df = pd.merge(df, reserve, how = 'left',\n",
    "                  left_on = ['air_store_id', 'visit_date'],\n",
    "                  right_on = ['air_store_id', 'visit_date'])\n",
    "    # add visitors number mean, median, max and min per each restaurant\n",
    "    df = pd.merge(df, mean_df, how = 'left',\n",
    "                  left_on = ['air_store_id', 'non_working'],\n",
    "                  right_on = ['air_store_id', 'non_working'])\n",
    "    df = pd.merge(df, median_df, how = 'left',\n",
    "                  left_on = ['air_store_id', 'non_working'],\n",
    "                  right_on = ['air_store_id', 'non_working'])\n",
    "    df = pd.merge(df, max_df, how = 'left',\n",
    "                  left_on = ['air_store_id', 'non_working'],\n",
    "                  right_on = ['air_store_id', 'non_working'])\n",
    "    df = pd.merge(df, min_df, how = 'left',\n",
    "                  left_on = ['air_store_id', 'non_working'],\n",
    "                  right_on = ['air_store_id', 'non_working'])\n",
    "    # change NaN to 0\n",
    "    df = df.fillna(0) \n",
    "    return df\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# combine train/test data with additional information\n",
    "air_train = air_visit_data.copy()\n",
    "X = merge_join(air_train)\n",
    "X_test = merge_join(air_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "X['id'] = X['air_store_id'] + str('_') + X['visit_date']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_test['id'] = X_test['air_store_id'] + str('_') + X_test['visit_date']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index([u'air_store_id', u'visit_date', u'visitors', u'month', u'day_of_week',\n",
       "       u'holiday_eve', u'non_working', u'air_genre_name', u'air_area_name',\n",
       "       u'latitude', u'longitude', u'genre_in_area', u'total_r_in_area',\n",
       "       u'reserve_visitors', u'reserve_-12_h', u'reserve_12_37_h',\n",
       "       u'reserve_37_59_h', u'reserve_59_85_h', u'reserve_85+_h',\n",
       "       u'visitors_mean', u'visitors_median', u'visitors_max', u'visitors_min',\n",
       "       u'id'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "k = ['air_store_id', u'visit_date', u'visitors', u'month', u'day_of_week',u'air_genre_name', u'air_area_name',\n",
    "       u'latitude', u'longitude']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [],
   "source": [
    "X = X.drop(k,axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "k = ['air_store_id', u'visit_date',  u'month', u'day_of_week',u'air_genre_name', u'air_area_name',\n",
    "       u'latitude', u'longitude']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_test = X_test.drop(k,axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "X.to_csv('new_train.csv',index = False)\n",
    "X_test.to_csv('new_test.csv',index = False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
